global indata "/.../data"
set more off
set linesize 255

*-------------------------------------------------------------------------------
* Author: Dhiren Patki
*
* This program creates firm-level panel data using F5500 and LBD/BR data.
* These panels are used to pull worker-level data in 04_to_LEHD_ee.sas 
* and 05_agestruc_all.sas
*-------------------------------------------------------------------------------

*-------------------------------------------------------------------------------
* Create firm panel
*-------------------------------------------------------------------------------

import delimited using $indata/firmszpay_m.csv, clear stringcols(1 22)
sort firmid year

*Populate first_firmid when missing (i.e. when 5500 is not filed)
forvalues k = 1/10{
	replace first_firmid=first_firmid[_n-`k'] if ///
	firmid==firmid[_n-`k'] & _f5500_filed==0 & first_firmid==""
	replace first_firmid=first_firmid[_n+`k'] if ///
	firmid==firmid[_n+`k'] & _f5500_filed==0 & first_firmid==""
}

sort first_firmid year

*Create a list of first_firmid's that are seen consecutively in the LBD for each expt panel
forvalues c = 2001/2014{
	preserve
	
		* Define the sample
		gen group = `c'
		gen yr_first_trt= min(yr_first_frozen,yr_first_cb)		
		bysort first_firmid: egen temp = max(yr_first_trt)
		replace yr_first_trt = temp if yr_first_trt==.
		drop temp
		keep if yr_first_trt==. | yr_first_trt>=`c'
		*Must be observed in year == group
		gen helper= (year==group)
		bysort first_firmid: egen helper2 = max(helper)
		keep if helper2==/*redacted*/
		drop helper helper2
		
		*Keep firms seen consecutively in the LBD (any successor to first firmid)
		bysort first_firmid: egen firstyr = min(year)
		sort first_firmid year
		bysort first_firmid: gen k = _n-1
		gen firmseq = firstyr+k
		gen missing_flag= (year!=firmseq)
		bysort first_firmid: egen missing_yr_firm = max(missing_flag)
		drop if missing_yr_firm==/*redacted*/
		drop firstyr firmseq k missing_flag missing_yr_firm
		
		keep first_firmid group
		duplicates drop
		save $indata/consecLBD_m_c`c', replace
	restore
}

*Now make the main panel

*Destring accounting variables
local var "tot_db_income tot_db_emplee_contrib tot_db_emplr_contrib"
foreach v of local var{
	gen temp = subinstr(`v',",","",.)
	drop `v'
	destring temp, gen(`v')
	drop temp
}

*Interpolate missing DB actuarial variables
sort first_firmid year
local var1 "tot_db_contrib plan_assets plan_liab actives_liab rtd_liab"
local var2 "rtd_liab xpct_liab_incr tot_db_income tot_db_emplee_contrib tot_db_emplr_contrib"
local var "`var1' `var2'"

forvalues i = 1/5{
    foreach v of local var{

	    replace `v' = (`v'[_n+1]+`v'[_n-1])/2 if `v'==. ///
	    & first_firmid == first_firmid[_n-1] 
    }
}

replace tot_db_contrib = tot_db_emplee_contrib + tot_db_emplr_contrib if tot_db_contrib==.

*Merge with CPI data
merge m:1 year using $indata/bls_cpi
drop if _merge==2
drop _merge 

replace cov_rate_firmwide = firmwide_partcp/size if cov_rate_firmwide==.
replace firmwide_adj_partcp = firmwide_partcp if firmwide_adj_partcp==.

*Convert nominal dollar values into real 2010 terms
replace plan_assets=plan_assets*-1 if plan_assets<0
replace plan_liab = plan_liab*-1 if plan_liab<0
local var1 "tot_db_contrib tot_db_income tot_db_emplee_contrib tot_db_emplr_contrib"
local var2 "plan_assets plan_liab actives_liab rtd_liab xpct_liab_incr epayroll_br"
local var "`var1' `var2'"
foreach v of local var{
	replace `v' = `v'*annual_cpi
}

*Clean up yr first frozen/cb tag (identify plans by the first firmid)
bysort first_firmid: egen temp =max(yr_first_frozen)
replace yr_first_frozen = temp if yr_first_frozen==.
drop temp
bysort first_firmid: egen temp =max(yr_first_cb)
replace yr_first_cb = temp if yr_first_cb==.
drop temp

*Edit the coverage rate
gen cov_rate_firmwide_w = cov_rate_firmwide
replace cov_rate_firmwide_w = 1 if cov_rate_firmwide>1 & cov_rate_firmwide!=.


*Create experiment year panels 

forvalues c = 2001/2014{

	preserve

		* Define the sample
		gen group = `c'
		gen yr_first_trt= min(yr_first_frozen,yr_first_cb)		
		bysort first_firmid: egen temp = max(yr_first_trt)
		replace yr_first_trt = temp if yr_first_trt==.
		drop temp
		keep if yr_first_trt==. | yr_first_trt>=`c'
		
		*Keep firms that are observed consecutively in the LBD
		merge m:1 first_firmid using $indata/consecLBD_m_c`c', keepus(first_firmid)
		keep if _merge==3
		drop _merge

		*Keep firms seen 5 years pre-event in LBD and employment is not missing
		gen t = year - group 
		bysort first_firmid: egen min_t = min(t)
		replace min_t = . if first_firmid==/*redacted*/
		gen missing_flag=0
		replace missing_flag=1 if lbd_emp==. &  year<=`c'
		bysort first_firmid: egen missing_yr_firm = max(missing_flag)
		drop if missing_yr_firm==1
		drop if min_t>-5
		drop if t<-5
		drop t min_t missing_flag missing_yr_firm
		
		*Keep firms that are seen 5 consecutive years pre-event/incl group yr in F5500 	
		bysort first_firmid: egen first_yr = min(plan_year_end)
		sort first_firmid plan_year_end
		bysort first_firmid: gen k = _n-1
		gen firmseq = first_yr+k
		gen missing_flag= (plan_year_end!=firmseq & plan_year_end<=group)
		replace missing_flag = 1 if _f5500_filed==0 & year<=group
		bysort first_firmid: egen missing_yr_firm = max(missing_flag)
		drop if missing_yr_firm==1
		drop missing_yr_firm firmseq missing_flag k
		
		gen t = plan_year_end - group
		bysort first_firmid: egen min_t = min(t)
		drop if min_t>-5 
		drop min_t t 
		
		*Winsorize some variables at 1 and 99th percentile
		gen av_earn = lbd_pay*1000/lbd_emp
		local var1 "tot_db_contrib tot_db_income tot_db_emplee_contrib tot_db_emplr_contrib"
		local var2 "plan_assets plan_liab actives_liab rtd_liab xpct_liab_incr av_earn"
		local var "`var1' `var2'"
		foreach v of local var{
			qui sum `v', d
			gen `v'_w = `v'
			replace `v'_w = r(p99) if `v'_w>r(p99) & `v'_w!=.
			replace `v'_w = r(p1) if `v'<r(p1) & `v'_w!=.
		}

		gen funding_ratio = plan_assets_w/plan_liab_w

		local var "funding_ratio"
		foreach v of local var{
			qui sum `v', d
			gen `v'_w = `v'
			replace `v'_w = r(p99) if `v'_w>r(p99) & `v'_w!=.
			replace `v'_w = r(p1) if `v'<r(p1) & `v'_w!=.
		}

		*rescale vars using inv hyperbolic sin or ln
		gen log_size = asinh(lbd_emp)
		gen log_funding_ratio = ln(funding_ratio_w)
		gen log_assets = ln(plan_assets_w)
		gen log_liab = ln(plan_liab_w)
		gen liab_active_partcp = actives_liab_w/firmwide_adj_partcp
		gen log_liab_active_p = asinh(liab_active_partcp)
		gen incr_liab_partcp = xpct_liab_incr_w/firmwide_adj_partcp
		gen log_incr_liab_active_p = asinh(incr_liab_partcp)
		gen log_av_earn = ln(av_earn_w)
		
		save $indata/firmpanel_m_c`c', replace

	restore
}

*Stack the panels
use $indata/firmpanel_m_c2001, clear
forvalues c=2002/2014{
	append using $indata/firmpanel_m_c`c'
	save $indata/firmpanel_m, replace
}

*Update firm age when missing
gen temp = plan_year_end - firmage
bysort first_firmid: egen birthyr = max(temp)
drop temp
replace firmage = plan_year_end - birthyr if firmage==.

*Update plan retirement age when missing
sort group first_firmid plan_year_end
forvalues i = 1/20{
	bysort group firmid: gen temp1 = plan_r_age[_n-1]
	replace temp1 = 0 if temp1==.
	bysort group firmid: gen temp2 = plan_r_age[_n+1]
	replace temp2 = 0 if temp2==.
	gen temp3= (temp1!=0)
	gen temp4= (temp2!=0)
	gen imputed_r_age = round((temp1+temp2)/(temp3+temp4))
	replace plan_r_age = imputed_r_age if plan_r_age==. 
	drop temp* imputed_r_age
}

*Create pre-event coverage rate flag
gen k = year - group
bysort group first_firmid: egen av_cov_rate_pre = mean(cov_rate_firmwide_w) if inrange(k,/*redacted*/)
bysort group first_firmid: egen temp = max(av_cov_rate_pre)
replace av_cov_rate_pre = temp if av_cov_rate_pre==.
*Redacted: indicator abv_cov_cut := av_cov_rate_pre > threshold 
drop temp

*Diagnostics
tab k 
tab k abv_cov_cut
tab k abv_cov_cut if treat_flag

*Pre-event diagnostics
gen k2= plan_year_end - group
tab k2 if plan_year_end<=group
tab k2 abv_cov_cut if plan_year_end<=group
tab k2 abv_cov_cut if plan_year_end<=group & treat_flag
drop k2

save $indata/firmpanel_m, replace

*---------------------------------------------------------------------------------
* Define two sample of firms to look for in LEHD ECF
* 1. Coverage rate above threshold from c-5 to c-2, all firms alive in c-5
* 2. Unrestricted sample (firmszpanel)
*---------------------------------------------------------------------------------


use $indata/firmpanel_m, clear

*Diagnostics
tabstat lbd_emp, by(group)
tabstat mu, by(group)

*Restricted (above cut) sample to merge with LEHD for person level data
preserve

	keep if abv_cov_cut & year == group-5
	keep firmid year treat_flag group
	export delimited $indata/abvcut_firms.csv, replace datafmt 

restore

*Unrestricted sample, unique firms (all firm id-years observed pre-event) -- for age structure file
preserve
	keep if year<=group
	keep firmid year  
	duplicates drop 
	export delimited $indata/unqfirmspanel_pre.csv, replace datafmt 
restore

*Unrestricted file (all group-firm id-years observed pre-expt year) -- to check quality of LEHD merge
preserve
	keep if year<=group
	keep firmid year group 
	duplicates drop 
	export delimited $indata/allfirmspanel_pre.csv, replace datafmt 
restore

*Unrestricted sample, unique firms (all firm id-years) -- for age structure file
preserve
	keep firmid year  
	duplicates drop 
	export delimited $indata/unqfirmspanel.csv, replace datafmt 
restore

*Unrestricted file (all group-firm id-years) -- to check quality of LEHD merge
preserve
	keep firmid year group 
	duplicates drop 
	export delimited $indata/allfirmspanel.csv, replace datafmt 
restore

*-----------------------------------------------------------
* File to merge with worker panel 
*-----------------------------------------------------------

*Age structure file from LEHD
import delimited using $indata/firmagestruc_a.csv, clear stringcols(1)
save $indata/firmagestruc_a, replace

*Create a version of fimpanel with the age structure vars attached 
use $indata/firmpanel_m, clear

*Merge with age structure file
merge m:1 firmid year using $indata/firmagestruc_a
*Flag all periods in which a firm in not found in the LEHD
gen pre_period = (k<=0)
*Count number of pre-period obs for a first_firmid-group
bysort group first_firmid: egen pre_obs=sum(pre_period)
*Count number of merged pre-period obs for a firm-group
gen merge_pre= (_merge==3 & pre_period==/*redacted*/)
bysort group first_firmid: egen merged_pre_obs = sum(merge_pre)

*Flag for group-first_firmid's to keep
gen _mergeok= (merged_pre_obs==pre_obs)
keep if _mergeok==/*redacted*/
drop merge_pre merged_pre_obs pre_obs pre_period _merge

*Diagnostics
tab k 
tab k abv_cov_cut
tab k abv_cov_cut if treat_flag==/*redacted*/

*Replace 2-digit industry by modal industry for the firm if missing, else 0
gen mnaics2_e = mnaics2
replace mnaics2_e = 0 if mnaics2==.
gen ind_2_t = mnaics2
replace ind_2_t = . if ind_2_t==0
bysort group first_firmid: egen temp=mode(ind_2_t), maxmode
replace ind_2_t=temp if ind_2_t==.
drop temp
replace mnaics2_e=ind_2_t if mnaics2_e==0 & ind_2_t!=.
drop ind_2_t

*Replace 3-digit industry by modal industry for the firm if missing, else 0
tostring mnaics4, gen(mnaics4_c)
gen mnaics3_c = substr(mnaics4_c,1,3)
destring mnaics3_c, gen(mnaics3)
drop mnaics4_c mnaics3_c
gen mnaics3_e = mnaics3
replace mnaics3_e = 0 if mnaics3==.
gen ind_3_t = mnaics3
replace ind_3_t = . if ind_3_t==0
bysort group first_firmid: egen temp=mode(ind_3_t), maxmode
replace ind_3_t=temp if ind_3_t==.
drop temp
replace mnaics3_e=ind_3_t if mnaics3_e==0 & ind_3_t!=.
drop ind_3_t

*firm age as of c-5
gen firmage_cm5 = firmage if k ==/*redacted*/
bysort group first_firmid: egen temp = max(firmage_cm5)
replace firmage_cm5 = temp if firmage_cm5==.
drop temp

save $indata/firmpanel_LEHD_m, replace

*Clean up input ds
forvalues c = 2001/2014{
	erase $indata/consecLBD_m_c`c'.dta
	erase $indata/firmpanel_m_c`c'.dta
}

